Start by doing today's quiz (Review Day 3)
# Quiz: a '|'-separated record. Predict which of the messages gets printed.
row = 'sofa|2000|buy|Uppsala'
fields = row.split('|')
price = int(fields[1])  # second field, converted from text with int()
if price == 2000:
    print('The price is a number!')
if price == '2000':
    print('The price is a string!')
print(sorted([ 2000, 30, 100 ]))
print(sorted(['2000', '30', '100']))
# Hint: is `'30' > '2000'`?
Each type stores a specific kind of information
int for integers, float for floating point values (decimals), str for strings, list for lists, dict for dictionaries.
Each type supports different operations, functions and methods.
30 > 2000
'30' > '2000'
30 > int('2000')
'ACTG'.lower()
[1, 2, 3].lower()
'2000' and '0.5' and '1e9'
int('2000')
int('0.5')
int('1e9')
float('2000')
float('1.5')
float('1e9')
int(float('1e9'))
1, 0, '1', '0', '', {}
bool(1)
bool(0)
bool('1')
bool('0')
bool('')
bool({})
# Use each value as an `if` condition and report which branch is taken.
values = [1, 0, '', '0', '1', [], [0]]
for x in values:
    if x:
        print(repr(x), 'is true!')
    else:
        print(repr(x), 'is false!')
list("hello")
str(['h', 'e', 'l', 'l', 'o'])
'_'.join(['h', 'e', 'l', 'l', 'o'])
genre_list = ["comedy", "drama", "drama", "sci-fi"]
genre_list
genres = set(genre_list)
'drama' in genres
genre_counts = {"comedy": 1, "drama": 2, "sci-fi": 1}
genre_counts
movie = {"rating": 10.0, "title": "Toy Story"}
movie
def hello_function(number):
    """Print `number`, then return the constant 2.

    NOTE(review): the local increment below does not influence the return
    value; it only requires that `number` supports `+= 2`.
    """
    # print the user input
    print(number)
    number += 2
    return 2
A function that counts the number of occurrences of 'C' (upper or lower case) in the argument string.
def cytosine_count(nucleotides):
    """Return how many items of `nucleotides` equal 'c' or 'C'."""
    count = 0
    for x in nucleotides:
        if x == 'c' or x == 'C':
            count += 1
    return count
count1 = cytosine_count('CATATTAC')
count2 = cytosine_count('tagtag')
print(count1, count2)
Functions that return are easier to repurpose than those that print their result
cytosine_count('catattac') + cytosine_count('tactactac')
def print_cytosine_count(nucleotides):
    """Print how many items of `nucleotides` equal 'c' or 'C'.

    Nothing is returned explicitly, so the call itself evaluates to None.
    """
    count = 0
    for x in nucleotides:
        if x == 'c' or x == 'C':
            count += 1
    print(count)
print_cytosine_count('catattac') + print_cytosine_count('tactactac')
list_A = ['red', 'green']
list_B = ['red', 'green']
list_B.append('blue')
print(list_A, list_B)
list_A = ['red', 'green']
list_B = list_A # another name to the SAME list. Aliasing
list_B.append('blue')
print(list_A, list_B)
list_A = ['red', 'green']
list_B = list_A
list_A = []
print(list_A, list_B)
list_A = ['red', 'green']
lists = {'A': list_A, 'B': list_A}
print(lists)
lists['B'].append('blue')
print(lists)
list_A = ['red', 'green']
lists = {'A': list_A, 'B': list_A}
print(lists)
lists['B'] = lists['B'] + ['yellow']
print(lists)
movies = ['Toy story', 'Home alone']
def some_thriller_movies():
    """Return a new list holding two thriller titles."""
    return ['Fargo', 'The Usual Suspects']
movies = some_thriller_movies()
print(movies)
def change_to_drama(movies):
    """Rebind the local name `movies` to a new list.

    The assignment only affects the name inside this function: the list
    passed in is not modified and nothing is returned.
    """
    movies = ['Forrest Gump', 'Titanic']
change_to_drama(movies)
print(movies)
sorted(list('file'), reverse=True)
attribute = 'gene_id "unknown gene"'
attribute.split(sep=' ', maxsplit=1)
# print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)
print('x=', end='')
print('1')
open(file, mode='r', encoding=None) # some arguments omitted
open('files/recipes.txt', 'w', encoding='utf-8')
open('files/recipes.txt', mode='w', encoding='utf-8')
open('files/recipes.txt', encoding='utf-8', mode='w')
def format_sentence(subject, value, end):
    """Return the sentence 'The <subject> is <value><end>'.

    All three arguments are required and are joined with `+`, so they must
    be strings.
    """
    return 'The ' + subject + ' is ' + value + end
print(format_sentence('lecture', 'ongoing', '.'))
print(format_sentence('lecture', 'ongoing', end='.'))
print(format_sentence(subject='lecture', value='ongoing', end='...'))
print(format_sentence(subject='lecture', 'ongoing', '.'))
def format_sentence(subject, value, end='.'):
    """Return the sentence 'The <subject> is <value><end>'.

    `end` is optional and defaults to '.'. The arguments are joined with
    `+`, so they must be strings.
    """
    return 'The ' + subject + ' is ' + value + end
print(format_sentence('lecture', 'ongoing'))
print(format_sentence('lecture', 'ongoing', '...'))
# None as the default value of an optional parameter
def format_sentence(subject, value, end='.', second_value=None):
    """Return 'The <subject> is <value><end>', optionally with a second value.

    When `second_value` is left at None the sentence has one value; otherwise
    ' and <second_value>' is inserted before `end`. The arguments are joined
    with `+`, so those that are used must be strings.
    """
    if second_value is None:
        return 'The ' + subject + ' is ' + value + end
    else:
        return 'The ' + subject + ' is ' + value + ' and ' + second_value + end
print(format_sentence('lecture', 'ongoing'))
print(format_sentence('lecture', 'ongoing',
second_value='self-referential', end='!'))
None: the value a function call gives back when the function has no explicit return
bool(None)
None == False, None == 0
To tell None apart from the other false values such as 0, False and '', use is None:
counts = {'drama': 2, 'romance': 0}
counts.get('romance'), counts.get('thriller')
counts.get('romance') is None
counts.get('thriller') is None
# Three independent checks per value: identity with None, falsiness, truthiness.
values = [None, 1, 0, '', '0', '1', [], [0]]
for x in values:
    if x is None:
        print(repr(x), 'is None')
    if not x:
        print(repr(x), 'is false')
    if x:
        print(repr(x), 'is true')
Controlling loops - break
# Illustration: without `break` the loop keeps going after the match.
for x in lines_in_a_big_file:
    if x.startswith('>'):  # this is the only line I want!
        do_something(x)
...waste of time!
# Illustration: stop iterating as soon as the wanted line has been handled.
for x in lines_in_a_big_file:
    if x.startswith('>'):  # this is the only line I want!
        do_something(x)
        break  # break the loop
Controlling loops - continue
for x in lines_in_a_big_file:
if x.startswith('>'): # irrelevant line
# just skip this! don't do anything
do_something(x)
# Illustration: skip lines starting with '>' and process every other line.
for x in lines_in_a_big_file:
    if x.startswith('>'):  # irrelevant line
        continue  # go on to the next iteration
    do_something(x)
# Illustration: the same filtering written with a negated condition.
for x in lines_in_a_big_file:
    if not x.startswith('>'):  # not irrelevant!
        do_something(x)
Another control statement: pass - the placeholder
def a_function():
# I have not implemented this just yet
# `pass` gives the function a syntactically valid, empty body.
def a_function():
    # I have not implemented this just yet
    pass
a_function()
Check out the module index
How to find the right module?
How to understand it?
How to find the right module?
How to understand it?
import math
help(math.acosh)
help(str)
help(math.sqrt)
# install packages using: pip
# Dimitris' protip: install packages using conda
math.sqrt(3)
import math
math.sqrt(3)
import math as m
m.sqrt(3)
from math import sqrt
sqrt(3)
Remember help()?
Works because somebody else has documented their code!
def process_file(filename, chrom, pos):
    """
    Read a vcf file, search for lines matching
    chromosome chrom and position pos.
    Print the genotypes of the matching lines.

    Parameters:
        filename: path of the tab-separated file to read.
        chrom: value compared against the first column of each line.
        pos: value compared against the second column; it is passed through
            str() first, so both 100 and '100' match a line containing 100.

    For every matching line the list of columns from the tenth one onwards
    is printed. Lines starting with '#' are ignored.
    """
    # The with-statement closes the file even if an error occurs mid-loop.
    with open(filename) as vcf_file:
        for line in vcf_file:
            if line.startswith('#'):
                continue  # header / comment line
            # Drop the line ending so it does not stick to the last column.
            col = line.rstrip('\n').split('\t')
            if col[0] == chrom and col[1] == str(pos):
                print(col[9:])
help(process_file)
help(process_file)
Your code may have two types of users:
Write documentation for both of them!
"""
What does this function do?
"""
# implementation details
At the beginning of the file
"""
This module provides functions for...
"""
`
For every function
def make_list(x):
    """Returns a random list of length x."""
    # NOTE(review): the body is still a placeholder, so a call currently
    # returns None rather than the list the docstring describes.
    pass
my_list[5] += other_list[3] # explain why you do this!
title = 'Toy Story'
rating = 10
print('The result is: ' + title + ' with rating: ' + str(rating))
# f-strings (since python 3.6)
print(f'The result is: {title} with rating: {rating}')
# format method
print('The result is: {} with rating: {}'.format(title, rating))
# the ancient way (python 2)
print('The result is: %s with rating: %s' % (title, rating))
Learn more from the Python docs: https://docs.python.org/3.9/library/string.html#format-string-syntax
DataFrame type:
import pandas as pd
df = pd.DataFrame({
'age': [1,2,3,4],
'circumference': [2,3,5,10],
'height': [30, 35, 40, 50]
})
df
pd.read_table: tab separated values .tsv
pd.read_csv: comma separated values .csv
pd.read_excel: Excel spreadsheets .xlsx
For a data frame df: df.to_csv(sep='\t') for tab separated values, df.to_csv() for comma separated values, df.to_excel() for Excel spreadsheets
!cat ../downloads/Orange_1.tsv
df = pd.read_table('../downloads/Orange_1.tsv')
df
age, circumference, height
dataframe.columnname
dataframe['columnname']
df.columns
df[['height', 'age']]
df.height
df[['age', 'circumference']].describe()
df['age'].std()
import math
df['radius'] = df['circumference'] / 2.0 / math.pi
df
dataframe.iloc[index]
dataframe.iloc[start:stop]
df.iloc[1:3]
!head -n 6 ../downloads/Orange.tsv
df = pd.read_table('../downloads/Orange.tsv') # , index_col=0)
df.iloc[0:5] # can also use .head()
df.Tree.unique()
type(pd.DataFrame({"genre": ['Thriller', 'Drama'], "rating": [10, 9]}).rating.iloc[0])
#young = df[df.age < 200]
#young
df[df.age < 1000]
df.loc[ df.age < 200 ]
df.head()
max_c = df.circumference.max()
print(max_c)
df[df.circumference == max_c]
df.columnname.plot()
small_df = pd.read_table('../downloads/Orange_1.tsv')
small_df.plot(x='age', y='height')
What if no plot shows up?
%pylab inline # jupyter notebooks
or
import matplotlib.pyplot as plt
plt.show()
df[['circumference', 'age']].plot.bar()
df[['circumference', 'age']].plot.bar(figsize=(12, 8), fontsize=16)
df.plot.scatter(x="column_name", y="other_column_name")
df.plot.scatter(x='age', y='circumference',
figsize=(12, 8), fontsize=14)
dataframe.plot.line(x=..., y=...)
tree1 = df[df['Tree'] == 1]
tree1.plot.line(x='age', y='circumference',
fontsize=14, figsize=(12,8))
df.groupby('Tree').plot.line(x='age', y='circumference')
Read the Orange_1.tsv
Use Pandas to read IMDB
Extra exercises: